/*===================================================================
infutor_matcher_kc.do
	*Started:	2020-Feb-03
	*Updated:	2021-Feb-11
	*Author:	David Phillips, Seth Zissette
	*Purpose:	Match King County clients to their Infutor records
===================================================================*/

/*README
	Before using this file, set the macros to update them throughout 
	the .do file.
	
	DONE? - [Y/N]
*/

clear

/*SETTING MACROS=====================================================
	mydirectory:		Your working directory (folder)
	datafilename:		The file name of the Excel data file
===================================================================*/
	*global mydirectory		[...]
	*global datafilename		[...]
	global extractname		mergedlast4_scc_ever_112021.dta
	global mydirectory		Z:\Infutor Projects\PII\Santa Clara County
	global datafilename		Example File.xlsx
	*global extractname ever_kc_ssn.dta
	cap ssc install freqindex
	cap ssc install matchit
	cd "$mydirectory"
	cap log close
	log using matchlog.smcl, replace
					

/*CLEANING "USING" FILE - FROM PROJECT===============================
	Prepare the project data for linkage to Infutor. Prepare
	variables that will be needed for matching: first name, last
	name, month of birth, year of birth, client ID, and scalars for
	each year of birth present in the data.
===================================================================*/
	clear
	cd "$mydirectory"
	
	use $extractname
	tempfile extract
	*drop yob_inf 
	gen yob_inf = floor(DOB/100)	
	replace yob_inf = DOB if DOB < 9999
	egen idnum = seq()	
	save `extract', replace
	clear

	import excel "$datafilename", first

	*Generating and formating first and last names
	gen First = upper(ClientsFirstName)
	gen Last = upper(ClientsLastName)

	*Generating and formating month and year of birth
	gen mob = month(ClientsDateofBirthDate)
	gen yob = year(ClientsDateofBirthDate)
	replace mob = . if mob < 1 | mob > 12
	replace yob = . if yob < 1900 | yob > 2005

	*Clean SSNs
	drop ClientsSSNLast4 
	replace ClientsSSN = "" if substr(ClientsSSN,1,3) == "000"
	replace ClientsSSN = "" if substr(ClientsSSN,1,3) == "999"	
	replace ClientsSSN = "" if ClientsSSN == "123-45-6789"
	
	gen SSN_last4 = substr(ClientsSSN,-4,.)
	*destring SSN_last4, replace force
	rename ClientsSSN SSN


	*Generating client ID
	rename HMISID LEOid
	egen clientid = seq()
	
	keep LEOid First Last yob clientid SSN_last4	

	tempfile temp_cleaned_using
	save `temp_cleaned_using', replace

	


	*Generating scalars for each reported year of birth
	summarize yob
	scalar minyear = r(min)
	scalar maxyear = r(max)
	forvalues y = `=minyear'(1)`=maxyear' {
		count if yob == `y'
		scalar count_`y' = r(N)
	}


	
	
/*MATCH RECORDS ON EACH YOB FROM INFUTOR SUBSET - SSN TO SSN=========
	Match client records with SSNs to Infutor records using last 4 of
	SSN.
===================================================================*/


	*Keep only those with SSNs
	drop if SSN_last4 == ""
	rename yob yob_inf
	tempfile temp_cleaned_ssn_ssn
	save `temp_cleaned_ssn_ssn'
	
	*Generating scalars for each reported year of birth
	summarize yob_inf
	scalar minyear = r(min)
	scalar maxyear = r(max)
	forvalues y = `=minyear'(1)`=maxyear' {
		count if yob_inf == `y'
		scalar count_`y' = r(N)
	}
	
	*matching by yob	
	forvalues y = `=minyear'(1)`=maxyear' {
		if `=count_`y'' > 0 {
			clear
			use `extract' if abs(yob_inf - `y') <= 0.5
			tempfile temp_inf_yob_subset_ssn_ssn
			save `temp_inf_yob_subset_ssn_ssn'
			
			clear
			use `temp_cleaned_ssn_ssn'
			keep if yob == `y'
			egen num = seq(), by(SSN_last4)
			reshape wide LEOid First Last yob_inf clientid, i(SSN_last4) j(num)
			merge 1:m SSN_last4 using `temp_inf_yob_subset_ssn_ssn' , keep(3) keepusing(PID_ First_Name_ Last_Name_ idnum)
			drop _merge
			reshape long LEOid First Last yob_inf clientid, i(PID_) j(num)
			drop num
			rename yob_inf yob
			
			if `y' != `=minyear' {
				append using `inf_kc_fuzzymatch_ssn_ssn'

			}
			tempfile inf_kc_fuzzymatch_ssn_ssn
			save `inf_kc_fuzzymatch_ssn_ssn'
		}
	}
	
	*Fuzzy match on names
	matchit First First_Name_, gen(similscore_first)
	matchit Last Last_Name_, gen(similscore_last)
	gen similscore_name = (similscore_first + similscore_last)/2
	
	*Save matched records
	keep if similscore_name >= 0.5
	gen SSNmatch=1
	tempfile inf_kc_fuzzymatch_ssn_ssn_done
	save `inf_kc_fuzzymatch_ssn_ssn_done'	
	
	
	
/*MATCH RECORDS ON EACH YOB FROM INFUTOR SUBSET - SSN TO NO SSN======
	Match client records with SSNs to Infutor records that do not
	have SSNs.
===================================================================*/
	clear
	use `temp_cleaned_using'
	drop if SSN_last4 == ""
	
	*Generating scalars for each reported year of birth
	summarize yob
	scalar minyear = r(min)
	scalar maxyear = r(max)
	forvalues y = `=minyear'(1)`=maxyear' {
		count if yob == `y'
		scalar count_`y' = r(N)
	}

	tempfile temp_cleaned_ssn_nossn
	save `temp_cleaned_ssn_nossn'	
	
	forvalues y = `=minyear'(1)`=maxyear' {
		if `=count_`y'' > 0 {
			*Pulling subsetted Infutor records for each year
			clear
			use `extract' if abs(yob_inf - `y') <= 0.5
			tempfile temp_inf_yob_subset_ssn_nossn
			save `temp_inf_yob_subset_ssn_nossn'			
			
			*Creating match score for first name from each file
			clear
			use `temp_cleaned_ssn_nossn'
			keep if yob == `y'
			matchit clientid First using `temp_inf_yob_subset_ssn_nossn', idusing(idnum) txtusing(First_Name_) threshold(0.5) override
			merge m:1 idnum using `temp_inf_yob_subset_ssn_nossn', nogen keepusing(PID_) keep(1 3)
			rename similscore similscore_first
			if `y' != `=minyear' {
				append using `temp_fn_ssn_nossn'
			}
			tempfile temp_fn_ssn_nossn
			save `temp_fn_ssn_nossn'
			
			*Creating match score for last name from each file
			clear 
			use `temp_cleaned_ssn_nossn'
			keep if yob == `y' 
			matchit clientid Last using `temp_inf_yob_subset_ssn_nossn', idusing(idnum) txtusing(Last_Name_) threshold(0.5) override
			merge m:1 idnum using `temp_inf_yob_subset_ssn_nossn', nogen keepusing(PID_) keep(1 3)
			rename similscore similscore_last
			if `y' != `=minyear' {
				append using `temp_ln_ssn_nossn'
			}
			tempfile temp_ln_ssn_nossn
			save `temp_ln_ssn_nossn'
		}
	}
	
	*Combining the two temporary name files
		*Note: This excludes records with first or last names that do
		*not match by matchit at all.
	clear
	use `temp_ln_ssn_nossn'
	merge 1:1 clientid idnum using `temp_fn_ssn_nossn'
	replace similscore_first = 0 if _merge == 1
	replace similscore_last = 0 if _merge == 2
	drop _merge

	*Calculating a cumulative similarity score by averaging scores
		*for first and last name
	gen similscore_name = (similscore_first + similscore_last)/2
	keep if similscore_name >= 0.5
	gsort clientid - similscore_name
	tempfile temp_combo_ssn_nossn
	save `temp_combo_ssn_nossn'

	*Merging back on original id
	merge m:1 clientid using `temp_cleaned_ssn_nossn', keepusing(LEOid yob SSN_last4) keep(1 3)
	drop if _merge == 2
	drop _merge

	*Appending back to previous matches
	gen SSNmatch = 0
	order LEOid First Last yob clientid SSN_last4 PID_ First_Name_ Last_Name_ idnum similscore_first similscore_last similscore_name SSNmatch
	tempfile temp_combo_ssn_nossn_toappend
	save `temp_combo_ssn_nossn_toappend'
	clear
	use `inf_kc_fuzzymatch_ssn_ssn_done'
	append using `temp_combo_ssn_nossn_toappend'
	tempfile inf_kc_fuzzymatch_ssn_nossn
	save `inf_kc_fuzzymatch_ssn_nossn'
	
	*Removing records of clients matching the same person to the same Infutor record twice
	gsort clientid PID_ -SSNmatch
	drop if clientid == clientid[_n-1] & PID_ == PID[_n-1]
	tempfile inf_kc_fuzzymatch_ssns_done
	save `inf_kc_fuzzymatch_ssns_done'



/*MATCH RECORDS ON EACH YOB FROM INFUTOR SUBSET - NO SSN TO ANY======
	Match client records with no SSNs to Infutor records.
===================================================================*/
	clear
	use `temp_cleaned_using'
	keep if SSN_last4 == ""
	
	*Generating scalars for each reported year of birth
	summarize yob
	scalar minyear = r(min)
	scalar maxyear = r(max)
	forvalues y = `=minyear'(1)`=maxyear' {
		count if yob == `y'
		scalar count_`y' = r(N)
	}

	tempfile temp_cleaned_nossn_any
	save `temp_cleaned_nossn_any'
	
	forvalues y = `=minyear'(1)`=maxyear' {
		if `=count_`y'' > 0 {
			*Pulling subsetted Infutor records for each year
			clear
			use  `extract' if abs(yob_inf - `y') <= 0.5
			tempfile temp_inf_yob_subset_nossn_any
			save `temp_inf_yob_subset_nossn_any'
		
			*Creating match score for first name from each file
			clear
			use `temp_cleaned_nossn_any'
			keep if yob == `y'
			matchit clientid First using `temp_inf_yob_subset_nossn_any', idusing(idnum) txtusing(First_Name_) threshold(0.5) override
			merge m:1 idnum using `temp_inf_yob_subset_nossn_any', nogen keepusing(PID_) keep(1 3)
			rename similscore similscore_first
			if `y' != `=minyear' {
				append using `temp_fn_nossn_any'
			}
			tempfile temp_fn_nossn_any
			save `temp_fn_nossn_any'
			
			*Creating match score for last name from each file
			clear 
			use `temp_cleaned_nossn_any'
			keep if yob == `y' 
			matchit clientid Last using `temp_inf_yob_subset_nossn_any', idusing(idnum) txtusing(Last_Name_) threshold(0.5) override
			merge m:1 idnum using `temp_inf_yob_subset_nossn_any', nogen keepusing(PID_) keep(1 3)
			rename similscore similscore_last
			if `y' != `=minyear' {
				append using `temp_ln_nossn_any'
			}
			tempfile temp_ln_nossn_any
			save `temp_ln_nossn_any'
		}
	}

	*Combining the two temporary name files
		*Note: This excludes records with first or last names that do
			*not match by matchit at all.
	clear
	use `temp_ln_nossn_any'
	merge 1:1 clientid idnum using `temp_fn_nossn_any'
	replace similscore_first = 0 if _merge == 1
	replace similscore_last = 0 if _merge == 2
	drop _merge

	*Calculating a cumulative similarity score by averaging scores
		*for first and last name
	gen similscore_name = (similscore_first + similscore_last)/2
	keep if similscore_name >= 0.5
	gsort clientid - similscore_name
	tempfile temp_combo_nossn_any
	save `temp_combo_nossn_any'

	*Merging back on original id
	merge m:1 clientid using `temp_cleaned_nossn_any', keepusing(LEOid yob SSN_last4)
	drop if _merge == 2
	drop _merge
	
	*Appending back to previous matches
	gen SSNmatch = -1
	order LEOid First Last yob clientid SSN_last4 PID_ First_Name_ Last_Name_ idnum similscore_first similscore_last similscore_name SSNmatch
	tempfile temp_combo_nossn_any_toappend
	save `temp_combo_nossn_any_toappend'
	clear
	use `inf_kc_fuzzymatch_ssns_done'
	append using `temp_combo_nossn_any_toappend'
	gen anyssn = (SSN_last4 ~= "")
	tempfile inf_kc_fuzzymatched
	save `inf_kc_fuzzymatched'


	
/*MERGE RECORDS BACK ONTO INFUTOR RECORDS============================
	Merge matched records back onto Infutor data.
===================================================================*/
	clear
	use `inf_kc_fuzzymatched'
	
	*Generating scalars for each reported year of birth
	summarize yob
	scalar minyear = r(min)
	scalar maxyear = r(max)
	
	
	tempfile finalmatch
	
	
	if minyear ~= . {
		
		forvalues y = `=minyear'(1)`=maxyear' {
			count if yob == `y'
			scalar count_`y' = r(N)
		}
		
		forvalues y = `=minyear'(1)`=maxyear' {
			if `=count_`y'' > 0 {
				clear
				use `extract' if abs(yob_inf - `y') <= 0.5
				tempfile temp_inf_yob_subset_final
				save `temp_inf_yob_subset_final'
				
				clear
				use `inf_kc_fuzzymatched'
				keep if yob == `y'
				merge m:1 idnum using `temp_inf_yob_subset_final' , keep(3) keepusing(Gender_ DeceasedCD_ Fips_County* City_* State_* Zipcode_* Effective_Date_* Last_Verification_Address_*)
				keep LEOid PID_ clientid anyssn similscore* Gender_ DeceasedCD_ Fips_County* City_* State_* Zipcode_* Effective_Date_* Last_Verification_Address_* SSNmatch
				
				if `y' != `=minyear' {
					append using `finalmatch'
				}
				save `finalmatch', replace
			}
		}
		
	}
	
	duplicates report LEOid PID_
	drop PID_
	
	save inf_scc_fuzzymatch_final.dta, replace

	
	log close
	
	
	